Daten importieren und vorbereiten¶

In [ ]:
# Imports
import pandas as pd

# Read the enriched dataset; parse the timestamp column as datetime up front
data_enriched = pd.read_csv("data_enriched.csv", parse_dates=['Zeitstempel'])
In [ ]:
# Time range covered by the data
print(data_enriched['Zeitstempel'].min())
print(data_enriched['Zeitstempel'].max())

# Drop columns not used as features: ArtikelNr, Zeitstempel, and Menge_log
# (presumably the log-transformed target Menge — dropping avoids target
# leakage; TODO confirm).
data_enriched = data_enriched.drop(['ArtikelNr', 'Zeitstempel', 'Menge_log'], axis=1)
2018-01-02 00:00:00
2022-02-22 00:00:00
In [ ]:
# Alle Artikel
all_articles = data_enriched['Artikel'].value_counts().index.tolist()
all_articles
Out[ ]:
['Laugen-Gipfel          of',
 'Butter-Gipfel / Croissant of',
 'caffè crema',
 'St. Galler Handbürli Culinarium *Gold prämiert*',
 'Körnergipfel of',
 'Weggli',
 'Mais-Gipfel of',
 'Sonnenblumenbrötli  of',
 'St. Galler Handb dk. Culinarium *Gold prämiert*',
 'Semmel                 of',
 'Nuss-Stengel Original',
 'Pain Roule rustico klein  of',
 'Gallusbrot 400   of',
 'Mais-Brötli süss   of',
 'Berliner m Confi Himbeer of',
 'Erdbeertörtli gross    ',
 'Butterzopf       440   2-teilig ',
 'Ziger-Krapfen          of']

Alle Modelle fine-tunen¶

In [ ]:
# Funktionen importieren
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
In [ ]:
# Function to calculate the score
def calculate_score(result):
    # Calculate score
    score = (0.6 * result['R2'] * (1 - result['MAPE']) * (1 / result['RMSE'])) + (0.2 * (1 - result['MAPE']) * (1 / result['RMSE'])) + (0.2 * result['R2'])
    return score
In [ ]:
# Create dictionary for best_models
best_models = {}
# Dateframe for all metrics
all_results = pd.DataFrame()

# Define hyperparameters for random search
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 50],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 50],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.5, 1.0],
    'colsample_bytree': [0.4, 1.0],
    'min_child_weight': [1, 4]
}

# Loop over articles
for artikel in all_articles:

    # Filter by Artikel
    data_filtered = data_enriched[data_enriched['Artikel'] == artikel].drop(['Artikel'], axis=1)

    X = data_filtered.drop(['Menge'], axis=1)
    y = data_filtered['Menge']

    # Split data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4153)

    models = [
        ('LinearRegression', LinearRegression(), {}),
        ('RandomForestRegressor', RandomForestRegressor(), rf_params),
        ('XGBRegressor', XGBRegressor(), xgb_params)
    ]

    # Initialize a variable to keep track of the best RMSE and corresponding model
    best_rmse = float('inf')
    best_model_info = None

    for name, model, params in models:

        if params:
            # Hyperparameter tuning using RandomizedSearchCV
            model_cv = RandomizedSearchCV(model, params, cv=5, n_iter=10, random_state=4153)
            model_cv.fit(X_train, y_train)
            best_model = model_cv.best_estimator_
        else:
            best_model = model
            best_model.fit(X_train, y_train)

        # Predict y
        y_pred = best_model.predict(X_test)

        # Calculate metrics
        r2 = r2_score(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Save results in a dictionary
        result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': name}

        # Calculate score
        result['score'] = calculate_score(result)

        # Add to results dataframe
        all_results = pd.concat([all_results, pd.DataFrame([result])], ignore_index=True)    

        # If it's the best model so far, save it
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_info = {'model': best_model, 'metrics': result}
    
    # Save the best model for this artikel
    best_models[artikel] = best_model_info
In [ ]:
import pickle

# Persist the per-article best models for use in later notebooks
with open('best_models.pkl', 'wb') as file_handle:
    pickle.dump(best_models, file_handle)

Plot Results¶

Our Score¶

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the custom score of every model per article
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results, x='Artikel', y='score', hue='modelname', ax=ax)
ax.set_title('Scores for each Artikel')
ax.set_ylabel('Score')
ax.set_xlabel('Artikel')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

R2¶

In [ ]:
# Compare R2 of every model per article
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results, x='Artikel', y='R2', hue='modelname', ax=ax)
ax.set_title('R2 for each Artikel')
ax.set_ylabel('R2')
ax.set_xlabel('Artikel')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

RMSE¶

In [ ]:
# Compare RMSE of every model per article
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results, x='Artikel', y='RMSE', hue='modelname', ax=ax)
ax.set_title('RMSE for each Artikel')
ax.set_ylabel('RMSE')
ax.set_xlabel('Artikel')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

Variable Importance¶

In [ ]:
# Inspect the winning model and its metrics for every article
for artikel, model_info in best_models.items():
    print(artikel)
    print(model_info)
Laugen-Gipfel          of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10,
                      n_estimators=300), 'metrics': {'Artikel': 'Laugen-Gipfel          of', 'R2': 0.6627369296333037, 'RMSE': 4.87800554001229, 'MAPE': 0.2148137293918103, 'modelname': 'RandomForestRegressor', 'score': 0.22874662415555236}}
Butter-Gipfel / Croissant of
{'model': RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Butter-Gipfel / Croissant of', 'R2': 0.7066574089100415, 'RMSE': 17.982623591987302, 'MAPE': 0.20390633584918344, 'modelname': 'RandomForestRegressor', 'score': 0.16895581712240984}}
caffè crema
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, n_estimators=300), 'metrics': {'Artikel': 'caffè crema', 'R2': 0.6634038050016473, 'RMSE': 10.931616226414894, 'MAPE': 0.25385630688431504, 'modelname': 'RandomForestRegressor', 'score': 0.17350047772249583}}
St. Galler Handbürli Culinarium *Gold prämiert*
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10,
                      n_estimators=300), 'metrics': {'Artikel': 'St. Galler Handbürli Culinarium *Gold prämiert*', 'R2': 0.21921916977812095, 'RMSE': 7.938905773111951, 'MAPE': 0.48110396370560776, 'modelname': 'RandomForestRegressor', 'score': 0.06551311523215625}}
Körnergipfel of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Körnergipfel of', 'R2': 0.6140729229946368, 'RMSE': 3.8832691832362958, 'MAPE': 0.30771712266985146, 'modelname': 'RandomForestRegressor', 'score': 0.22415287945049006}}
Weggli
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, n_estimators=200), 'metrics': {'Artikel': 'Weggli', 'R2': 0.5221181270101496, 'RMSE': 3.3357022391004483, 'MAPE': 0.2205785840529178, 'modelname': 'RandomForestRegressor', 'score': 0.22435468771556372}}
Mais-Gipfel of
{'model': RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Mais-Gipfel of', 'R2': 0.5863327206561324, 'RMSE': 3.541304074415829, 'MAPE': 0.4789148935736349, 'modelname': 'RandomForestRegressor', 'score': 0.19846108832447573}}
Sonnenblumenbrötli  of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, n_estimators=200), 'metrics': {'Artikel': 'Sonnenblumenbrötli  of', 'R2': 0.3699374119375518, 'RMSE': 2.5375262070750257, 'MAPE': 0.48600412442625485, 'modelname': 'RandomForestRegressor', 'score': 0.15945929224855931}}
St. Galler Handb dk. Culinarium *Gold prämiert*
{'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.4, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=4, missing=nan, monotone_constraints=None,
             n_estimators=300, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...), 'metrics': {'Artikel': 'St. Galler Handb dk. Culinarium *Gold prämiert*', 'R2': 0.22528609136472566, 'RMSE': 6.286680467429014, 'MAPE': 0.5155542127318328, 'modelname': 'XGBRegressor', 'score': 0.07088523625953266}}
Semmel                 of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10,
                      n_estimators=300), 'metrics': {'Artikel': 'Semmel                 of', 'R2': 0.34121630754748355, 'RMSE': 3.2123947804646136, 'MAPE': 0.42367614697763484, 'modelname': 'RandomForestRegressor', 'score': 0.1408543335579437}}
Nuss-Stengel Original
{'model': LinearRegression(), 'metrics': {'Artikel': 'Nuss-Stengel Original', 'R2': -0.0021114790147238605, 'RMSE': 2.915950677427657, 'MAPE': 0.7041052864028683, 'modelname': 'LinearRegression', 'score': 0.019744052649939302}}
Pain Roule rustico klein  of
{'model': RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Pain Roule rustico klein  of', 'R2': 0.8154665271885964, 'RMSE': 4.413756384641824, 'MAPE': 0.4852917215344433, 'modelname': 'RandomForestRegressor', 'score': 0.2434733826755387}}
Gallusbrot 400   of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Gallusbrot 400   of', 'R2': 0.3651120092669522, 'RMSE': 3.268461516816381, 'MAPE': 0.30427740262421493, 'modelname': 'RandomForestRegressor', 'score': 0.16222477527197693}}
Mais-Brötli süss   of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, n_estimators=200), 'metrics': {'Artikel': 'Mais-Brötli süss   of', 'R2': 0.33460189322392575, 'RMSE': 2.017214018376093, 'MAPE': 0.25776153039382643, 'modelname': 'RandomForestRegressor', 'score': 0.21438134684339533}}
Berliner m Confi Himbeer of
{'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=50, max_leaves=None,
             min_child_weight=4, missing=nan, monotone_constraints=None,
             n_estimators=300, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...), 'metrics': {'Artikel': 'Berliner m Confi Himbeer of', 'R2': 0.316613154674098, 'RMSE': 8.212453649924912, 'MAPE': 0.5094356651347668, 'modelname': 'XGBRegressor', 'score': 0.08661705037475946}}
Erdbeertörtli gross    
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Erdbeertörtli gross    ', 'R2': 0.33017165577975394, 'RMSE': 2.518308573781984, 'MAPE': 0.6012645844655415, 'modelname': 'RandomForestRegressor', 'score': 0.12906781489123598}}
Butterzopf       440   2-teilig 
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10), 'metrics': {'Artikel': 'Butterzopf       440   2-teilig ', 'R2': 0.7411862643597246, 'RMSE': 2.6896141238490627, 'MAPE': 0.39365310636856266, 'modelname': 'RandomForestRegressor', 'score': 0.2935811401820453}}
Ziger-Krapfen          of
{'model': RandomForestRegressor(max_depth=50, min_samples_leaf=4, min_samples_split=10,
                      n_estimators=300), 'metrics': {'Artikel': 'Ziger-Krapfen          of', 'R2': 0.22389419263674926, 'RMSE': 8.802969741503238, 'MAPE': 0.4924556679328408, 'modelname': 'RandomForestRegressor', 'score': 0.06405535639230567}}
In [ ]:
from functions import plot_variable_importance

# Plot feature importances / coefficients of the best model per article.
for artikel, models in best_models.items():
    model = models['model']
    model_name = models['metrics']['modelname']

    # NOTE(review): X_train here is leftover state from the LAST iteration of
    # the training loop above, not this artikel's own split. This only works
    # if the feature columns are identical for all articles — TODO confirm.
    plot_variable_importance(model, X_train, model_name, artikel)

Try Voting Regressor with best models¶

In [ ]:
from sklearn.ensemble import VotingRegressor

# Best model (base models and the ensemble compete) per article
best_models_voting = {}

# Collect metric records and build the DataFrame once after the loop —
# growing a DataFrame with pd.concat per row is quadratic.
voting_result_records = []

# Fit, tune and ensemble the candidate models for every article
for artikel in all_articles:

    # Keep only this article's rows; the article name itself is not a feature
    data_filtered = data_enriched[data_enriched['Artikel'] == artikel].drop(['Artikel'], axis=1)

    X = data_filtered.drop(['Menge'], axis=1)
    y = data_filtered['Menge']

    # Split data into train and test set (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4153)

    # Seed the stochastic estimators so a fresh "Run All" reproduces the models
    models = [
        ('LinearRegression', LinearRegression(), {}),
        ('RandomForestRegressor', RandomForestRegressor(random_state=4153), rf_params),
        ('XGBRegressor', XGBRegressor(random_state=4153), xgb_params)
    ]

    # Track the best RMSE and corresponding model for this article
    best_rmse = float('inf')
    best_model_info = None

    # Tuned base estimators that will feed the VotingRegressor
    estimators = []

    for name, model, params in models:

        if params:
            # Hyperparameter tuning using RandomizedSearchCV
            model_cv = RandomizedSearchCV(model, params, cv=5, n_iter=10, random_state=4153)
            model_cv.fit(X_train, y_train)
            best_model = model_cv.best_estimator_
        else:
            # No search space (LinearRegression): fit as-is
            best_model = model
            best_model.fit(X_train, y_train)

        # Add the tuned model to the ensemble's estimator list
        estimators.append((name, best_model))

        # Evaluate on the held-out test set
        y_pred = best_model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        # Explicit sqrt: `squared=False` is deprecated in recent scikit-learn
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Metrics record for this (article, model) pair
        result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': name}
        result['score'] = calculate_score(result)
        voting_result_records.append(result)

        # Keep the model with the lowest RMSE so far
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_info = {'model': best_model, 'metrics': result}

    # Ensemble of the tuned base models
    voting_regressor = VotingRegressor(estimators)
    voting_regressor.fit(X_train, y_train)

    # Evaluate the ensemble on the same test set
    y_pred = voting_regressor.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    mape = mean_absolute_percentage_error(y_test, y_pred)

    result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': 'VotingRegressor'}
    result['score'] = calculate_score(result)
    voting_result_records.append(result)

    # The ensemble competes with the base models on RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_model_info = {'model': voting_regressor, 'metrics': result}

    # Save the best model and metrics for this article
    best_models_voting[artikel] = best_model_info

# Dataframe with all metrics (same columns as the previous per-row concat)
all_results_voting = pd.DataFrame(voting_result_records)
In [ ]:
import pickle

# Persist the voting-ensemble results for use in later notebooks
with open('best_models_voting.pkl', 'wb') as file_handle:
    pickle.dump(best_models_voting, file_handle)

Plot results¶

Our Score¶

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Custom score per article and model, including the VotingRegressor
plt.figure(figsize=(10, 5))
sns.barplot(data=all_results_voting, x='Artikel', y='score', hue='modelname')
plt.title('Scores for each Artikel')
plt.ylabel('Score')  # fixed: was labeled 'R2' although the y-axis shows the score
plt.xlabel('Artikel')
plt.xticks(rotation=90)
plt.legend(title='Model')
plt.show()

R2¶

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# R2 per article and model, including the VotingRegressor
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results_voting, x='Artikel', y='R2', hue='modelname', ax=ax)
ax.set_title('R2 for each Artikel')
ax.set_ylabel('R2')
ax.set_xlabel('Artikel')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

RMSE¶

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# RMSE per article and model, including the VotingRegressor
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=all_results_voting, x='Artikel', y='RMSE', hue='modelname', ax=ax)
ax.set_title('RMSE for each Artikel')
ax.set_ylabel('RMSE')
ax.set_xlabel('Artikel')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Model')
plt.show()

Plot Variable Importance¶

In [ ]:
from functions import plot_variable_importance

# Plot feature importances / coefficients of the best model per article.
# The printed "No feature importances..." output below shows this fails for
# VotingRegressor winners (no importances/coefficients on the ensemble).
for artikel, models in best_models_voting.items():
    model = models['model']
    model_name = models['metrics']['modelname']

    # NOTE(review): X_train is leftover state from the LAST iteration of the
    # voting loop above, not this artikel's own split — TODO confirm columns
    # are identical across articles.
    plot_variable_importance(model, X_train, model_name, artikel)
No feature importances or coefficients available for model Estimator 0
No feature importances or coefficients available for model Estimator 0
No feature importances or coefficients available for model Estimator 0